# Housekeeping ----
# Record when/where this analysis was run and the full session state so the
# knitted output is reproducible. The `##` lines are the recorded knit output.
date()
## [1] "Tue Nov 8 15:27:58 2022"
Sys.time()
## [1] "2022-11-08 15:27:58 CST"
# Working directory and project contents at knit time.
getwd()
## [1] "/Users/arihangupta/Downloads/PhiCB5_FISH_Analysis"
list.files()
## [1] "10-05.fish.Rmd" "Data"
## [3] "EDA_RNA_fish_data_split.Rmd" "EDA_RNAfish_ilastik.Rmd"
## [5] "Output" "PhiCB5_FISH_Analysis.Rproj"
## [7] "README.md"
# Where user packages are installed vs. the base-R library.
.libPaths()
## [1] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library"
.Library
## [1] "/Library/Frameworks/R.framework/Resources/library"
# Full session record: R version, platform, locale, attached packages.
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.3.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.29 R6_2.5.1 jsonlite_1.8.0 magrittr_2.0.3
## [5] evaluate_0.15 stringi_1.7.6 rlang_1.0.6 cli_3.4.1
## [9] rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1 rmarkdown_2.14
## [13] tools_4.2.0 stringr_1.4.0 xfun_0.31 yaml_2.3.5
## [17] fastmap_1.1.0 compiler_4.2.0 htmltools_0.5.2 knitr_1.39
## [21] sass_0.4.1
# The attach search path (names) and the on-disk paths behind it.
search()
## [1] ".GlobalEnv" "package:stats" "package:graphics"
## [4] "package:grDevices" "package:utils" "package:datasets"
## [7] "package:methods" "Autoloads" "package:base"
searchpaths()
## [1] ".GlobalEnv"
## [2] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/stats"
## [3] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/graphics"
## [4] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/grDevices"
## [5] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/utils"
## [6] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/datasets"
## [7] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/methods"
## [8] "Autoloads"
## [9] "/Library/Frameworks/R.framework/Resources/library/base"
# Detailed version components (platform, major/minor, release date, nickname).
R.Version()
## $platform
## [1] "aarch64-apple-darwin20"
##
## $arch
## [1] "aarch64"
##
## $os
## [1] "darwin20"
##
## $system
## [1] "aarch64, darwin20"
##
## $status
## [1] ""
##
## $major
## [1] "4"
##
## $minor
## [1] "2.0"
##
## $year
## [1] "2022"
##
## $month
## [1] "04"
##
## $day
## [1] "22"
##
## $`svn rev`
## [1] "82229"
##
## $language
## [1] "R"
##
## $version.string
## [1] "R version 4.2.0 (2022-04-22)"
##
## $nickname
## [1] "Vigorous Calisthenics"
# Packages required for this analysis. Duplicates from the original list
# ("skimr", "readxl", "ggpubr" each appeared twice) have been removed;
# library() ignores already-attached packages, but a clean list is easier
# to audit.
packages <- c("ggplot2", "dplyr", "tidyr", "readxl", "ggpubr", "skimr",
              "DataExplorer", "tidyverse", "svglite", "tidyxl", "ggforce",
              "ggsci", "ggthemes", "ragg", "gplots", "devtools", "readr")
### For this data we also need "Biobase", "org.Hs.eg.db", "AnnotationDbi",
### but they report as unavailable for this version of R -- they have probably
### been updated; generally there's an online central package database for R
### (CRAN), and Bioconductor especially for bioinformatics.
# install.packages(setdiff(packages, rownames(installed.packages())))
#
# update.packages(packages)

# Attach every package. Iterating over the names directly avoids the unsafe
# 1:length(x) pattern (which evaluates to c(1, 0) on an empty vector) and
# avoids leaving a stray loop index `i` in the global environment.
for (pkg in packages) {
  library(pkg, character.only = TRUE)
}
# Attach messages recorded from the original knit:
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.7 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'gplots'
##
##
## The following object is masked from 'package:stats':
##
## lowess
##
##
## Loading required package: usethis
# --- Raw ilastik object tables ------------------------------------------
# NOTE(review): the original read each of these three files twice (once into
# data1/2/3 and once into rawinf0/rawinf180/rawuninf200). Both sets of names
# are kept because the commented-out legacy code below uses data1-3 while the
# current pipeline uses the raw* names -- but each file is now read only once.
data1 <- read.csv("Data/data_table.csv", header = TRUE)       # infected, t = 0
data2 <- read.csv("Data/180-data_table.csv", header = TRUE)   # infected, t = 180
data3 <- read.csv("Data/200-data_table.csv", header = TRUE)   # uninfected, t = 200
rawinf0 <- data1
rawinf180 <- data2
rawuninf200 <- data3

# Pooled mean of the `Mean` column across the five replicate background
# measurements "Data/<prefix>.1.csv" ... "Data/<prefix>.5.csv" for one
# timepoint. All rows are pooled before averaging (not a mean of per-file
# means), which reproduces the original mean(c(...)) computation exactly.
# The fifteen intermediate means.* data frames are no longer kept around.
mean_background <- function(prefix) {
  files <- sprintf("Data/%s.%d.csv", prefix, 1:5)
  mean(unlist(lapply(files, function(f) read.csv(f, header = TRUE)$Mean)))
}
meanInt180 <- mean_background("180")
meanInt0 <- mean_background("0")
meanInt200 <- mean_background("200")
# Looking around: inspect what objects now exist in the workspace.
# Inventory of everything created so far in the global environment.
ls() # show me all the data objects that are present
## [1] "data1" "data2" "data3" "i" "meanInt0"
## [6] "meanInt180" "meanInt200" "means.0.1" "means.0.2" "means.0.3"
## [11] "means.0.4" "means.0.5" "means.180.1" "means.180.2" "means.180.3"
## [16] "means.180.4" "means.180.5" "means.200.1" "means.200.2" "means.200.3"
## [21] "means.200.4" "means.200.5" "packages" "rawinf0" "rawinf180"
## [26] "rawuninf200"
# newData1 = as.data.frame(data1$Predicted.Class)
# newData1[,1] = "0"
# newData1 <- cbind(newData1, data1$Size.in.pixels, data1$Mean.Intensity_2, data1$Maximum.intensity_2, data1$Total.Intensity_2)
# colnames(newData1) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt")
# newData1 <- cbind(newData1, ((newData1$MeanInt)/meanInt0), ((newData1$MaxInt)/meanInt0), ((newData1$SumInt)/meanInt0), ((newData1$MeanInt - meanInt0)/meanInt0))
# colnames(newData1) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt", "StandMean", "StandMax", "StandSum", "PercentIncrease")
#
# NOTE(review): the commented-out block below had copy-paste bugs -- the 180
# block pulled columns from data1 instead of data2, the 200 block referenced
# an undefined `uninf200` instead of data3, and both divided StandSum by
# meanInt0 instead of their own timepoint's mean. Corrected versions:
# newinf180 = as.data.frame(data2$Predicted.Class)
# newinf180[,1] = "180"
# newinf180 <- cbind(newinf180, data2$Size.in.pixels, data2$Mean.Intensity_2, data2$Maximum.intensity_2, data2$Total.Intensity_2)
# colnames(newinf180) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt")
# newinf180 <- cbind(newinf180, ((newinf180$MeanInt)/meanInt180), ((newinf180$MaxInt)/meanInt180), ((newinf180$SumInt)/meanInt180), ((newinf180$MeanInt - meanInt180)/meanInt180))
# colnames(newinf180) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt", "StandMean", "StandMax", "StandSum", "PercentIncrease")
#
# newuninf200 = as.data.frame(data3$Predicted.Class)
# newuninf200[,1] = "200"
# newuninf200 <- cbind(newuninf200, data3$Size.in.pixels, data3$Mean.Intensity_2, data3$Maximum.intensity_2, data3$Total.Intensity_2)
# colnames(newuninf200) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt")
# newuninf200 <- cbind(newuninf200, ((newuninf200$MeanInt)/meanInt200), ((newuninf200$MaxInt)/meanInt200), ((newuninf200$SumInt)/meanInt200), ((newuninf200$MeanInt - meanInt200)/meanInt200))
# colnames(newuninf200) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt", "StandMean", "StandMax", "StandSum", "PercentIncrease")
# Columns carried forward from the raw ilastik export (channels 0-2).
intensity_cols <- c(
  "object_id", "Size.in.pixels", "Object.Area",
  "Variance.of.Intensity_0", "Variance.of.Intensity_1", "Variance.of.Intensity_2",
  "Mean.Intensity_0", "Mean.Intensity_1", "Mean.Intensity_2",
  "Maximum.intensity_0", "Maximum.intensity_1", "Maximum.intensity_2",
  "Minimum.intensity_0", "Minimum.intensity_1", "Minimum.intensity_2",
  "Total.Intensity_0", "Total.Intensity_1", "Total.Intensity_2"
)

# Subset one raw ilastik table to the intensity columns and add
# background-corrected channel-2 intensities plus factor annotations.
#   raw     : raw ilastik data.frame (rawinf0 / rawinf180 / rawuninf200)
#   bg_mean : mean background intensity for this timepoint (meanInt*)
#   cond    : "infected" / "uninfected"
#   time    : timepoint label ("0", "180", "200")
#   id      : sample identifier
# mutate() recycles length-1 values, so the original rep(..., nrow(.)) calls
# were unnecessary; factor() on a scalar gives the same factor columns.
prep_sample <- function(raw, bg_mean, cond, time, id) {
  out <- select(raw, all_of(intensity_cols))
  mutate(
    out,
    corrected_mean_intensity_2 = Mean.Intensity_2 / bg_mean,
    normalized_intensity_2 = (Mean.Intensity_2 - bg_mean) / bg_mean,
    normal_Maximum.intensity_2 = Maximum.intensity_2 / bg_mean,
    cond = factor(cond),
    time = factor(time),
    id = factor(id)
  )
}

inf0 <- prep_sample(rawinf0, meanInt0, "infected", "0", "inf0")
inf180 <- prep_sample(rawinf180, meanInt180, "infected", "180", "inf180")
uninf200 <- prep_sample(rawuninf200, meanInt200, "uninfected", "200", "uninf200")
# One long table with all three samples stacked row-wise.
data <- rbind(inf0, inf180, uninf200)
# For a more formal, up-to-date overview when working with big datasets:
# DataExplorer::introduce() gives a one-row summary of the combined table:
# dimensions, column types, missingness, and memory footprint.
introduce(data)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 563 24 3 21 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 563 13512 107104
# First six rows -- sanity-check column names and value ranges.
head(data)
## object_id Size.in.pixels Object.Area Variance.of.Intensity_0
## 1 0 239 239 1478288.0
## 2 1 91 91 2231988.0
## 3 2 159 159 2616356.2
## 4 3 98 98 935724.8
## 5 4 249 249 1715539.1
## 6 5 224 224 2386803.0
## Variance.of.Intensity_1 Variance.of.Intensity_2 Mean.Intensity_0
## 1 11997313 214094.5 8826.088
## 2 13073474 112568.1 9129.352
## 3 14619022 225386.5 8855.333
## 4 3687060 190662.2 8815.582
## 5 26412804 134157.7 9106.727
## 6 27148668 136859.2 8867.353
## Mean.Intensity_1 Mean.Intensity_2 Maximum.intensity_0 Maximum.intensity_1
## 1 26949.00 4400.092 13281 33802
## 2 21995.25 4153.989 12953 27626
## 3 22669.25 4349.692 12823 29080
## 4 25786.57 4429.541 11646 30352
## 5 29540.59 4318.852 13282 38405
## 6 26727.04 4162.294 12724 37138
## Maximum.intensity_2 Minimum.intensity_0 Minimum.intensity_1
## 1 6577 6922 16460
## 2 5167 6522 13328
## 3 5971 6282 12429
## 4 5761 7479 20910
## 5 5291 6333 14575
## 6 5249 6602 14100
## Minimum.intensity_2 Total.Intensity_0 Total.Intensity_1 Total.Intensity_2
## 1 3265 2109435 6440810 1051622
## 2 3330 830771 2001568 378013
## 3 3404 1407998 3604411 691601
## 4 3502 863927 2527084 434095
## 5 3295 2267575 7355606 1075394
## 6 3264 1986287 5986857 932354
## corrected_mean_intensity_2 normalized_intensity_2 normal_Maximum.intensity_2
## 1 1.205718 0.2057181 1.802237
## 2 1.138281 0.1382806 1.415867
## 3 1.191907 0.1919074 1.636180
## 4 1.213788 0.2137877 1.578636
## 5 1.183456 0.1834564 1.449846
## 6 1.140556 0.1405565 1.438337
## cond time id
## 1 infected 0 inf0
## 2 infected 0 inf0
## 3 infected 0 inf0
## 4 infected 0 inf0
## 5 infected 0 inf0
## 6 infected 0 inf0
# Last six rows -- confirm the uninfected/200 sample made it into the stack.
tail(data)
## object_id Size.in.pixels Object.Area Variance.of.Intensity_0
## 558 220 69 69 2347364
## 559 221 73 73 3457264
## 560 222 130 130 1647870
## 561 223 28 28 1886626
## 562 224 103 103 1794572
## 563 225 74 74 2203610
## Variance.of.Intensity_1 Variance.of.Intensity_2 Mean.Intensity_0
## 558 8144874 97106.73 9209.782
## 559 11581435 101303.98 9778.425
## 560 2082476 102143.75 9734.077
## 561 7466740 122256.81 9816.107
## 562 7599682 116769.94 9702.961
## 563 5724328 89278.86 8773.459
## Mean.Intensity_1 Mean.Intensity_2 Maximum.intensity_0 Maximum.intensity_1
## 558 20429.78 3944.246 14328 25915
## 559 20609.82 4066.137 14563 26505
## 560 16941.14 3900.262 13881 20086
## 561 20377.14 4090.893 13365 25563
## 562 18169.52 4030.942 13497 24322
## 563 17587.12 3874.811 12302 22413
## Maximum.intensity_2 Minimum.intensity_0 Minimum.intensity_1
## 558 4628 7196 13753
## 559 4923 7383 13725
## 560 4885 7761 13263
## 561 4968 7448 15390
## 562 4974 7611 12970
## 563 4585 6345 13212
## Minimum.intensity_2 Total.Intensity_0 Total.Intensity_1 Total.Intensity_2
## 558 3110 635475 1409655 272153
## 559 3056 713825 1504517 296828
## 560 3030 1265430 2202348 507034
## 561 3464 274851 570560 114545
## 562 3307 999405 1871461 415187
## 563 3381 649236 1301447 286736
## corrected_mean_intensity_2 normalized_intensity_2
## 558 1.080035 0.08003535
## 559 1.113412 0.11341213
## 560 1.067991 0.06799119
## 561 1.120191 0.12019090
## 562 1.103775 0.10377476
## 563 1.061022 0.06102214
## normal_Maximum.intensity_2 cond time id
## 558 1.267265 uninfected 200 uninf200
## 559 1.348043 uninfected 200 uninf200
## 560 1.337638 uninfected 200 uninf200
## 561 1.360365 uninfected 200 uninf200
## 562 1.362008 uninfected 200 uninf200
## 563 1.255490 uninfected 200 uninf200
# Relevant data for us: size in pixels, maximum intensity, minimum intensity,
# mean intensity, and variance of intensity. Personally I am going to remake
# the data frame with these parameters.
introduce(data)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 563 24 3 21 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 563 13512 107104
## Sometimes real-world data can be really messy; another good practice is to plot the missing data, as below. This lets us check the data: even though introduce() reports the number of missing values, we want to see where those values are, or confirm there are none at all.
# Visualize missingness per column (expected to be empty for this table).
plot_missing(data) # none will show up because no data is missing
#
# table(FACTORVARIABLES, useNA = "ifany" # it can be useful to put factors in a table, none here though, include NAs to check for NA
#
#
# ###--- look for missing values --- #####
#
#
# # is.na checks for NA values
# table(is.na(GFP[1])) # use row or column numbers [] with these if you onyl want to take some columns because they need the structure of the df versus a string call using the $
#
# # Make the distribution of NA's by columns/measurements
# sample_na = rowSums(is.na(data))
# table(sample_na)
#
# # Make the distribution of NA's by samples
# measurement_na = colSums(is.na(data))
# table(measurement_na)
#
# #### Make sure dimensions match up #####
# identical(data$Size.in.pixels, data$Object.Area) ##check if they are identical colnames and the ids if you have multiple tables (subject information)
### Take a look at the distribution of all the variables ###
# Histograms of every continuous column, one panel per variable, per sample.
inf0hist_data <-plot_histogram(inf0, title = "Inf0")
inf180hist_data <-plot_histogram(inf180, title = "inf180")
uninf200hist_data <-plot_histogram(uninf200, title = "uninf200")
### Quantile-quantile plots help us visualize deviations from a reference
### probability distribution (normal by default). After analyzing these plots
### we may have to apply transformations, such as a log, before doing any
### kind of linear regression.
qqplot_inf0 <-plot_qq(inf0, title = "Inf0")
qqplot_inf180 <-plot_qq(inf180, title = "Inf180")
qqplot_uninf200 <-plot_qq(uninf200, title = "Uninf200")
# Slicing and dicing can be crucial to analysis and see where you get variations. If you want to predict an outcome, it is good to look at all the features based on that value.
# NOTE(review): the three lines below were identical copy-pastes of the inf0
# call; presumably one per sample was intended:
# Minint_boxplot_inf0 <- plot_boxplot(inf0, by = 'Minimum.intensity_0')
# Minint_boxplot_inf180 <- plot_boxplot(inf180, by = 'Minimum.intensity_0')
# Minint_boxplot_uninf200 <- plot_boxplot(uninf200, by = 'Minimum.intensity_0')
# PCA on a single sample: drop the non-informative object id first.
inf0_pca <- drop_columns(inf0, "object_id")
inf0_pca_plot <- plot_prcomp(inf0_pca)
## Warning in plot_prcomp(inf0_pca): The following features are dropped due to zero variance:
## * cond_infected
## * time_0
## * id_inf0
# PCA on the combined table: also drop Size.in.pixels (it duplicates
# Object.Area, as checked earlier).
data_pca <- drop_columns(data, c("object_id", "Size.in.pixels"))
# BUG FIX: the original called plot_prcomp(data), ignoring the freshly
# prepared data_pca -- so the columns dropped above were still fed to the PCA.
data_pca_plot <- plot_prcomp(data_pca)
# (You can answer in normal text -- I cannot remember what that does in the
# knit function; we will see.)
# Right now our goal is simply to get a snapshot of the data: see whether
# anything needs to be cleaned, and get an idea of the distributions and how
# they compare with one another. We use what we know about the data to check
# that it is picking up what we know is there, before we ask questions of the
# data whose answers we do not know.
# Why is this important?
# masterData <- rbind(newData1, newinf180, newuninf200)
# p1 <- ggplot(data = masterData, mapping = aes(x= MaxInt, color = ID)) + geom_histogram(binwidth = 200)+ labs(x = "Max")
# p1
# p2 <- ggplot(data = masterData, mapping = aes(x= MeanInt, color = ID)) + geom_histogram(binwidth = 200)+ labs(x = "Mean")
# p2
# p3 <- ggplot(data = masterData, mapping = aes(x= SumInt, color = ID)) + geom_histogram(bins = 63)+ labs(x = "Sum")
# p3
# NOTE(review): the next plot was also assigned to p3, clobbering the SumInt
# histogram above; renamed to p3b (p4 is already taken below).
# p3b <- ggplot(data = masterData, mapping = aes(x= Size, color = ID)) + geom_histogram(bins = 63)+ labs(x = "Size")
# p3b
# p4 <- ggplot(data = masterData, mapping = aes(x= Size, y = SumInt ,color = ID)) + geom_point() + labs(x = "Size", y = "Sum of Intensity")
# p4
# p5 <- ggplot(data = newinf180, mapping = aes(x= SumInt, color = ID)) + geom_histogram(bins = 200)+ labs(x = "Sum")
# p5
# p6 <- ggplot(data = newinf180, mapping = aes(x= MaxInt, color = ID)) + geom_histogram(bins = 60)+ labs(x = "Max")
# p6
# p7 <- ggplot(data = masterData, mapping = aes(x= Size, y = StandSum ,color = ID)) + geom_point() + labs(x = "Size", y = "Standardized sum")
# p7
# p8 <- ggplot(masterData, aes(ID,SumInt)) + geom_boxplot() + labs(x = "Sum")
# p8
# p9 <- ggplot(data = masterData, mapping = aes(x= StandMean, color = ID)) + geom_histogram(bins = 75)+ labs(x = "Standardized Mean")
# p9
# p10 <- ggplot(data = masterData, mapping = aes(x= StandMax, color = ID)) + geom_histogram(bins = 75)+ labs(x = "Standardized Max")
# p10
# p11 <- ggplot(data = masterData, mapping = aes(x= StandSum, color = ID)) + geom_histogram(bins = 75)+ labs(x = "Standardized Sum")
# p11
# p12 <- ggplot(data = masterData, mapping = aes(x= PercentIncrease, color = ID)) + geom_histogram(bins = 75)+ labs(x = "Standardized Sum")
# p12
#For control data
# pControl0minscatter <- ggplot(data = newData1, mapping = aes(x= Size, y = SumInt ,color = ID)) + geom_point() + labs(x = "Size", y = "Sum of Intensity")
# pControl0minscatter
#
# pControl180minscatter <- ggplot(data = newuninf200, mapping = aes(x= Size, y = SumInt ,color = ID)) + geom_point() + labs(x = "Size", y = "Sum of Intensity")
# pControl180minscatter